/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.jmh;

import java.util.Arrays;
import java.util.SplittableRandom;
import java.util.concurrent.TimeUnit;
import java.util.function.IntSupplier;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.VectorUtil;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 3, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(
    value = 1,
    jvmArgsAppend = {
      "-Xmx1g",
      "-Xms1g",
      "-XX:+AlwaysPreTouch",
      "--add-modules",
      "jdk.incubator.vector"
    })
public class CompetitiveBenchmark {

  private final SplittableRandom R = new SplittableRandom(0);

  @Param("128")
  int size;

  double[] scores;
  int[] docs;

  // scores generated by nextDouble() locate in range [0, 1), so we can tune this parameter and
  // see how the performance changes depends on how selective the filter is.
  @Param({"0", "0.2", "0.4", "0.5", "0.8"})
  double minScoreInclusive;

  @Setup(Level.Trial)
  public void setUpTrial() {
    scores = new double[size];
    docs = new int[size];
  }

  @Setup(Level.Invocation)
  public void setUpInvocation() {
    for (int i = 0; i < size; i++) {
      docs[i] = R.nextInt(Integer.MAX_VALUE);
      scores[i] = R.nextDouble();
    }
  }

  @Benchmark
  public int baseline() {
    int newSize = 0;
    for (int i = 0; i < size; ++i) {
      if (scores[i] >= minScoreInclusive) {
        docs[newSize] = docs[i];
        scores[newSize] = scores[i];
        newSize++;
      }
    }
    return newSize;
  }

  @Benchmark
  public int branchlessCandidate() {
    int newSize = 0;
    for (int i = 0; i < size; ++i) {
      int inc = scores[i] >= minScoreInclusive ? 1 : 0;
      docs[newSize] = docs[i];
      scores[newSize] = scores[i];
      newSize += inc;
    }
    return newSize;
  }

  // This is an effort try to make the modification of newSize using cmov
  // see https://github.com/apache/lucene/pull/14906
  @Benchmark
  public int branchlessCandidateCmov() {
    int newSize = 0;
    for (int i = 0; i < size; ++i) {
      int doc = docs[i];
      double score = scores[i];
      docs[newSize] = doc;
      scores[newSize] = score;
      if (score >= minScoreInclusive) {
        newSize++;
      }
    }
    return newSize;
  }

  @Benchmark
  public int vectorizedCandidate() {
    return VectorUtil.filterByScore(docs, scores, minScoreInclusive, size);
  }

  public static void main(String[] args) {
    CompetitiveBenchmark baseline = new CompetitiveBenchmark();
    baseline.size = 128;
    baseline.setUpTrial();
    baseline.setUpInvocation();
    int baselineSize = baseline.baseline();

    CompetitiveBenchmark candidate = new CompetitiveBenchmark();
    candidate.size = 128;
    candidate.setUpTrial();
    candidate.setUpInvocation();

    for (IntSupplier s :
        new IntSupplier[] {
          candidate::branchlessCandidate,
          candidate::vectorizedCandidate,
          candidate::branchlessCandidateCmov
        }) {

      int candidateSize = s.getAsInt();

      if (baselineSize != candidateSize) {
        throw new IllegalArgumentException("incorrect size");
      }

      if (Arrays.equals(baseline.docs, 0, baselineSize, candidate.docs, 0, candidateSize)
          == false) {
        throw new IllegalArgumentException(
            "incorrect docs,"
                + "\nbaseline: "
                + Arrays.toString(ArrayUtil.copyOfSubArray(baseline.docs, 0, baselineSize))
                + "\ncandidate: "
                + Arrays.toString(ArrayUtil.copyOfSubArray(candidate.docs, 0, candidateSize)));
      }

      if (Arrays.equals(baseline.scores, 0, baselineSize, candidate.scores, 0, candidateSize)
          == false) {
        throw new IllegalArgumentException("incorrect scores");
      }
    }
  }
}
